We compare the genomic offset predictions from the four following methods:
Generalized dissimilarity modelling (GDM). See report 9_GeneralizedDissimilarityModelling_GenomicOffsetPredictions.html.
Latent factor mixed model (LFMM) . See report 7_LFMM_IdentificationCandidateSNPs_GenomicOffsetPredictions.html.
Gradient Forest (GF). See report 10_GradientForest_GenomicOffsetPredictions.html.
Redundancy analysis (RDA). See report 11_RedundancyAnalysis_GenomicOffsetPredictions.html.
Code
# Method namesmeth_names <-c("GDM","LFMM","GF","RDA")# population namespop_names <-readRDS(here("data/GenomicData/MainGenePoolPopulations.rds")) %>%arrange(pop) %>%pull(pop)# SNP setssnp_sets <-readRDS(here("outputs/list_snp_sets.rds"))
Comparing population ranks
In this section, we compare GO predictions from the different methods (GF, GDM, RDA and LFMM), SNP sets (candidates, control and all SNPs for LFMM) and GCMs. For that, for each combination of method/SNP set/GCM, we ranked the populations based on the GO predictions, i.e. populations with the lowest rank had the highest GO predictions.
Across methods/SNP sets
We first compared the population ranks from the different combinations method/SNP set. For that, we generated one plot per GCM. We also generated a plot in which the population ranks are not based on an unique GCM but are based on the average GO predictions across all GCMs. In the plots below, we colored the populations that were among the three populations with the highest genomic offset in at least one combination method/SNP set.
Code
# Ranking the populations for each combination method/SNP set/GCM# ===============================================================go_df <-lapply(meth_names, function(meth_name) {if(meth_name=="LFMM"){ go_predictions <-readRDS(file=here(paste0("outputs/LFMM/go_predictions_snpsets.rds")))} else { go_predictions <-readRDS(file=here(paste0("outputs/",meth_name,"/go_predictions.rds"))) }go_predictions <- go_predictions %>%lapply(function(snp_set) { snp_set$go %>%lapply(function(gcm){tibble(pop = pop_names,go = gcm) %>%arrange(go) %>%mutate(rank =rev(1:length(pop_names))) }) %>%bind_rows(.id="gcm") }) %>%bind_rows(.id="snp_set")if(meth_name=="LFMM"){go_predictions <-readRDS(file=here(paste0("outputs/LFMM/go_predictions_allsnps.rds")))$go %>%lapply(function(gcm){tibble(pop = pop_names,go = gcm,snp_set ="all") %>%arrange(go) %>%mutate(rank =rev(1:length(pop_names))) }) %>%bind_rows(.id="gcm") %>%bind_rows(go_predictions) } go_predictions}) %>%setNames(meth_names) %>%bind_rows(.id="method")# ============================================================================================================# ============================================================================================================# For each combination of method (GF, GDM, LFMM and RDA) and SNP set (candidates, control and all SNPs for LFMM), # we calculate the average of the GO predictions across the five GCMs, # and then we rank the populations based on their GO predicted value.# First option (that we should use I think): take the mean of the GO predictionsgo_df <- go_df %>%group_by(pop,method,snp_set) %>%summarise(go=mean(go)) %>%ungroup() %>%group_split(method,snp_set) %>%lapply(function(x){ x %>%arrange(go) %>%mutate(rank =rev(1:length(pop_names)),gcm ="GCMs_average") }) %>%bind_rows() %>%bind_rows(go_df)# Another option (the first I use, but I think we should use the option above): take the mean of the ranks# go_df %>%# group_by(pop,method,snp_set) %>%# summarise(mean_rank=mean(rank),go=mean(go)) %>%# ungroup() %>%# group_split(method,snp_set) %>%# lapply(function(x){# x %>%# arrange(mean_rank) %>%# mutate(rank = 1:length(pop_names),# gcm = "GCM average") %>%# dplyr::select(-mean_rank)# }) %>%# bind_rows() %>%# bind_rows(go_df)# =====================================# =====================================# FIGURE TITLE AND LABELS# We create a vector that we will use for the x-axis labels and the plot titlemethod_snpset_names <-c("GDM (candidates)","GDM (control)","GF (candidates)","GF (control)","LFMM (all SNPs)","LFMM (candidates)","LFMM (control)","RDA (candidates)","RDA (control)","Average across methods and SNP sets")names(method_snpset_names) <- go_df %>%filter(! (snp_set =="cand_corrected")) %>%mutate(meth_snpset =paste0(method, "_", snp_set)) %>%pull(meth_snpset) %>%unique() %>%c(.,"method_snpset_average")
Code
# =================================# Code the generate the bump charts# =================================# the colors I will use to color the populations with high GOmy_colors <-c(brewer.pal(n=12, "Paired"),"#FF40EE")bump_charts <- go_df %>%filter(! (snp_set =="cand_corrected")) %>%mutate(x_axis = method_snpset_names[paste0(method, "_", snp_set)]) %>%group_split(gcm) %>%lapply(function(x){high_go_pops <- x %>%filter(rank <4) %>%pull(pop) %>%unique() my_palette <-c(my_colors[1:length(high_go_pops)],"#E8E8E8")if(unique(x$gcm)=="GCMs_average"){ plot_title <-"Average across the five GCMs"} else { plot_title <-unique(x$gcm)}sub <- x %>%mutate(flag =ifelse(pop %in% high_go_pops, TRUE, FALSE),pop_col =if_else(flag ==TRUE, pop, "Others")) %>%mutate(pop =factor(pop, levels=c(setdiff(unique(x$pop),high_go_pops),high_go_pops)),pop_col =factor(pop_col, levels =c(high_go_pops,"Others")))p <- sub %>%ggplot(aes(x = x_axis, y = rank, group = pop)) +geom_point(aes(color = pop_col), size =2, alpha =0.9) +geom_line(aes(color = pop_col), linewidth =2, alpha =0.8) +scale_y_reverse(breaks =1:nrow(sub)) +scale_color_manual(values = my_palette) +geom_text(data = sub %>%filter(x_axis == sub$x_axis[[length(sub$x_axis)]] & pop %in% high_go_pops),aes(label = pop, x =9.1) , hjust =0.15, color ="gray20", size =3) +#color = "#888888",theme_bw() +ylab("Population rank") +ggtitle(plot_title) +theme(panel.grid.minor.y =element_blank(),panel.grid.major.y =element_blank(),panel.grid.major.x =element_blank(),legend.position ="none",axis.title.x =element_blank())ggsave(p, filename =here(paste0("figs/PredictionVariability/BumpChart_PopRank_",unique(x$gcm),".pdf")), device ="pdf", width=12)if(unique(x$gcm)=="GCMs_average"){ p_manuscript <- p +ggtitle("") ggsave(p_manuscript, filename =here(paste0("figs/PredictionVariability/BumpChart_PopRank_",unique(x$gcm),"_notitle.pdf")), device ="pdf", width=12)} p })pdf(here("figs/PredictionVariability/BumpCharts_PerMethSnpSet.pdf"), width=12)lapply(bump_charts, function(x) x)dev.off()bump_charts
Across GCMs
We then compared the population ranks from the different GCMs (or from the average GO predictions across the five GCMs). For that, we generated one plot per combination method/SNP set. We also generated a plot in which the population ranks are not based on an unique combination method/SNP set but are based on the average GO predictions across all combinations method/SNP set.
In the plots below, we colored the populations that were among the three populations with the highest genomic offset in at least one GCM.